Lec 33: Choropleth Maps (part 1)

Published

November 15, 2024

# packages
library(tidyverse)     # ecosystem of data science packages
library(rvest)         # for web scraping
library(sf)            # "simple features" (classes for maps)
library(rnaturalearth) # maps database
library(leaflet)       # web interactive maps
library(usmap)         # map of US states (includes Alaska and Hawaii)

Motivation: Forbes’ article “Average Salary by State in 2024”

The data visualization of this module is based on the choropleth map that appears in the following article (by Belle Wong) on Forbes:

Average Salary by State in 2024

https://www.forbes.com/advisor/business/average-salary-by-state/

Data

Average Salary by State data is available in the above Forbes’ webpage as an HTML table. Unfortunately, this HTML table is not scrapable.

The good news is the same data, plus the median salary, can be found in the following SoFi webpage:

https://www.sofi.com/learn/content/average-salary-in-us/

Data from SoFi

We can scrape the HTML table available in the SoFi webpage:

# Average US Salary by State (by Jacqueline DeMarco, 03/28/24)
url = "https://www.sofi.com/learn/content/average-salary-in-us/"

tbls = read_html(url) |>
  html_table()

wages = tbls[[1]]

# data with average and median salaries
wages = wages |>
  mutate(Average = as.numeric(str_remove_all(Average, "[,$]")),
         Median = as.numeric(str_remove_all(Median, "[,$]")))

Graphs: Choropleth Maps

Let’s go over a series of plotting rounds, starting with a basic map, and then gradually adding more elements, and customizing its appearance to get as close as possible to our target visualization.

Map of USA from "rnaturalearth"

We begin by producing a first map of the U.S. This requires to get map data of the US territory—at the State level—via ne_states(). Notice that the returnclass argument must be set as returnclass = "sf" to guarantee that the return map object is of class "sf" (simple features).

# map data
usa_map = ne_states(
  country = "united states of america", 
  returnclass = "sf")

class(usa_map)
[1] "sf"         "data.frame"

As you can tell, the object usa_map is an object of class "sf". But it is also a data.frame object, i.e. a table. It contains a large number of columns. The most important columns—for our exploratory graphing purposes—are name (name of State), and the latitude-longitude coordinates in geometry.

names(usa_map)
  [1] "featurecla" "scalerank"  "adm1_code"  "diss_me"    "iso_3166_2"
  [6] "wikipedia"  "iso_a2"     "adm0_sr"    "name"       "name_alt"  
 [11] "name_local" "type"       "type_en"    "code_local" "code_hasc" 
 [16] "note"       "hasc_maybe" "region"     "region_cod" "provnum_ne"
 [21] "gadm_level" "check_me"   "datarank"   "abbrev"     "postal"    
 [26] "area_sqkm"  "sameascity" "labelrank"  "name_len"   "mapcolor9" 
 [31] "mapcolor13" "fips"       "fips_alt"   "woe_id"     "woe_label" 
 [36] "woe_name"   "latitude"   "longitude"  "sov_a3"     "adm0_a3"   
 [41] "adm0_label" "admin"      "geonunit"   "gu_a3"      "gn_id"     
 [46] "gn_name"    "gns_id"     "gns_name"   "gn_level"   "gn_region" 
 [51] "gn_a1_code" "region_sub" "sub_code"   "gns_level"  "gns_lang"  
 [56] "gns_adm1"   "gns_region" "min_label"  "max_label"  "min_zoom"  
 [61] "wikidataid" "name_ar"    "name_bn"    "name_de"    "name_en"   
 [66] "name_es"    "name_fr"    "name_el"    "name_hi"    "name_hu"   
 [71] "name_id"    "name_it"    "name_ja"    "name_ko"    "name_nl"   
 [76] "name_pl"    "name_pt"    "name_ru"    "name_sv"    "name_tr"   
 [81] "name_vi"    "name_zh"    "ne_id"      "name_he"    "name_uk"   
 [86] "name_ur"    "name_fa"    "name_zht"   "FCLASS_ISO" "FCLASS_US" 
 [91] "FCLASS_FR"  "FCLASS_RU"  "FCLASS_ES"  "FCLASS_CN"  "FCLASS_TW" 
 [96] "FCLASS_IN"  "FCLASS_NP"  "FCLASS_PK"  "FCLASS_DE"  "FCLASS_GB" 
[101] "FCLASS_BR"  "FCLASS_IL"  "FCLASS_PS"  "FCLASS_SA"  "FCLASS_EG" 
[106] "FCLASS_MA"  "FCLASS_PT"  "FCLASS_AR"  "FCLASS_JP"  "FCLASS_KO" 
[111] "FCLASS_VN"  "FCLASS_TR"  "FCLASS_ID"  "FCLASS_PL"  "FCLASS_GR" 
[116] "FCLASS_IT"  "FCLASS_NL"  "FCLASS_SE"  "FCLASS_BD"  "FCLASS_UA" 
[121] "FCLASS_TLC" "geometry"  

With the "sf' object in place, we can pass it to ggplot() and its layer function geom_sf(). To get a map of the contiguous states, we adjust the latitude-longtiude coordinates with coord_sf():

# basic map of USA (contiguous states)
ggplot() +
  geom_sf(data = usa_map) +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50))

Merging map-data with salary-data

So far we have a map of the US States. But what about the salary data? We need to merge (join) the table of the map usa_map with the table of average salary by state wages. This merging operation is done with inner_join(), indicating the names of the columns in both tables that have the States: name from table usa_map, and State from table wages

# merging map and wages data tables
usa_map_wages = inner_join(usa_map, wages, by = c("name" = "State"))

Version 1

With the merged data usa_map_wages, we can plot again a map of the US contiguous states, color coding the states with the Average values:

# version 1
usa_map_wages |>
  ggplot() +
  geom_sf(aes(fill = Average)) +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50))

# another version with: scale_fill_binned(type = "gradient")
ggplot() +
  geom_sf(data = usa_map_wages, 
          aes(fill = Average)) +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50)) +
  theme_minimal() +
  scale_fill_binned(type = "gradient")

Version 2

We can try another fill scale such as "viridis"

# viridis color
ggplot() +
  geom_sf(data = usa_map_wages, 
          aes(fill = Average)) +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50)) +
  theme_minimal() +
  scale_fill_binned(type = "viridis")

Version 3

We need to change the colors in order to follow the scheme used in the choropleth map published in Forbes:

# version 2
usa_map_wages |>
  ggplot() +
  geom_sf(aes(fill = Average)) +
  scale_fill_gradient(low = "#0B057A", high = "#E6E6F1") +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50))

Version 4

What if you want your own salary bins? This involves categorizing salary values into different brackets, for example:

  • average salary greater than or equal to 70,000
  • average salary between 60,000 and 70,000
  • average salary between 50,000 and 60,000
  • average salary less than 50,000

Also, for sake of practice, we can change the color scheme (to pink):

# version 4
inner_join(usa_map, wages, by = c("name" = "State")) |>
  mutate(Cutoffs = case_when(
    Average >= 70000 ~ ">=70K",
    Average >= 60000 ~ "60K-70K",
    Average >= 50000 ~ "50K-60K",
    Average < 50000 ~ "<50K"),
    Cutoffs = factor(
      Cutoffs,
      levels = c("<50K", "50K-60K", "60K-70K", ">=70K"))) |>
  ggplot() +
  geom_sf(aes(fill = Cutoffs)) +
  scale_fill_manual(values = c("#e6e6f1", "#9D9BC9", "#5450a1", "#0b057a")) +
  coord_sf(xlim = c(-123, -65), ylim = c(25, 50))

Map with all States

What if we want to visualize all States (including Alaska and Hawaii) and not just the contiguous territory?

No problem! We can use one of the maps from the package "usmap", namely the map object returned by the function us_map().

The following code chunk starts with the merging operation to join us_map() data with the wages data, and the pipes it to ggplot() and friends. In addition, we are using a Viridis continuous scale color:

# use the data from "us_map()" which is already an "sf" object
us_map() |>
  inner_join(wages, by = c("full" = "State")) |>
  ggplot() +
  geom_sf(aes(fill = Average), color = "white") +
  scale_fill_viridis_c(direction = -1) +
  theme_void()


Interactive Map with Leaflet

The previous maps have been produced with ggplot. They are nice but are static visualizations. Interestingly, we can use the "leaflet" package that allows us graph interactive web maps.

Version 1

Arbitrary colors for each state, using a rainbow() palette:

# Leaflet map
leaflet(data = usa_map_wages) |> 
  addTiles() |>
  setView(lng = -90, lat = 40, zoom = 3) |>
  addPolygons(fillColor = rainbow(10), stroke = FALSE)

Version 2

Color coding the state polygons based on the variable Average, and choosing a "viridis" color palette:

col_pal = colorNumeric(palette = "viridis", 
             domain = usa_map_wages$Average)

leaflet(data = usa_map_wages) |> 
  addTiles() |>
  setView(lng = -90, lat = 40, zoom = 3) |>
  addPolygons(fillColor = ~col_pal(Average), 
              fillOpacity = 0.9,
              stroke = FALSE)

Adding a legend

Adding a legend for colors:

# add legend for colors
leaflet(data = usa_map_wages) |> 
  addTiles() |>
  setView(lng = -90, lat = 40, zoom = 3) |>
  addPolygons(fillColor = ~col_pal(Average), 
              fillOpacity = 0.9,
              stroke = TRUE,
              color = ~col_pal(Average),
              opacity = 1,
              weight = 2,
              label = ~paste0(name, "; ", Average)) |> 
  addLegend(position = "bottomleft",
            pal = col_pal, values = ~Average)

Another version

inner_join(usa_map, wages, by = c("name" = "State")) |>
  mutate(Cutoffs = case_when(
    Average >= 70000 ~ ">=70K",
    Average >= 60000 ~ "60K-70K",
    Average >= 50000 ~ "50K-60K",
    Average < 50000 ~ "<50K")) |>
  mutate(Color = case_when(
    Cutoffs == "<50K" ~ "#e6e6f1",
    Cutoffs == "50K-60K" ~ "#9D9BC9",
    Cutoffs == "60K-70K" ~ "#5450a1",
    Cutoffs == ">=70K" ~ "#0b057a"
  )) |>
  mutate(Label = paste0(name, ", $", Average)) |>
leaflet() |>
  addTiles() |>
  setView(lng = -100, 
          lat = 40, 
          zoom = 4) |>
  addPolygons(color = ~Color,
              fillColor = ~Color, 
              fillOpacity = 0.8, 
              label = ~Label)

Another Map: dragging FALSE

leaflet(data = usa_map_wages,
        options = leafletOptions(dragging = FALSE),
        height = 550, width = 780) |> 
  addTiles() |>
  setView(lng = -94, lat = 40, zoom = 4) |>
  addPolygons(fillColor = ~col_pal(Average), 
              fillOpacity = 0.9,
              stroke = FALSE,
              label = ~paste0(name, ", $", Average))